Feature generation

Using the kiln pressure data we can generate features to aid in concreteley determining whether certain patterns produce better yields. Some of the features discussed include:

  • Area between setpoint and average kiln temperature (completed)
  • Time spent at positive pressure between 0 and 1000 minutes (replaced)
  • During the time when the average kiln temperature <= 1200°F:
    • Duration at positive *pressure (pressure >= 0)
    • Duration at negative *pressure (pressure < 0)
    • Duration when (0.01 <= pressure <= 0.03)
    • Standard deviation of pressure
    • Mean of pressure
    • Add weather data once again?

Area under the curve was originally going to be calculated but after some thought, time spent within the pressure ranges seemed a more useful metric. Could potentially revisit an AUC calculation later on.

press_calcs <- kilns_All %>% 
  group_by(LOTNO) %>% 
  mutate(
    
    # adust pressure max and min 
    pressure = case_when(pressure > .07 ~ .07,
                         pressure < -.07 ~ -.07,
                         TRUE ~ pressure
                         ),
    
    # find time index where temp reaches 1200F
    close_1200  = if_else( (time < index_max_temp), (abs(1200 - avg_kiln_temp)), NULL ))

# join time index to original
press_calcs <- left_join(press_calcs, press_calcs %>% 
                           group_by(LOTNO) %>% 
                           dplyr::summarise(index_1200F = which.min(close_1200))
                         )

press_calcs <- press_calcs %>% 
  group_by(LOTNO) %>% 
  mutate(
    
    # segregate pressures into positive and negative columns
    press_pos = if_else((pressure >= 0) & (time <= index_1200F), pressure, NULL),
    press_neg = if_else((pressure <  0) & (time <= index_1200F), pressure, NULL),
    # get all pressures below 1200F point for easy mean, sd calcs
    press_1200F = if_else(time <= index_1200F, pressure, NULL),

    # segregate pressures into range columns
    press_greater_03 = if_else((pressure >= .03) &                    (time <= index_1200F), pressure, NULL),
    press_betw_02_03 = if_else((pressure >= .02) & (pressure < .03) & (time <= index_1200F), pressure, NULL),
    press_betw_01_02 = if_else((pressure >= .01) & (pressure < .02) & (time <= index_1200F), pressure, NULL),
    press_betw_00_01 = if_else((pressure >=  .0) & (pressure < .01) & (time <= index_1200F), pressure, NULL),
    press_less_00    = if_else((pressure <    0) &                    (time <= index_1200F), pressure, NULL),
    
    # add counter columns for easy aggregation of total times spent in range
    press_pos_count = if_else(!is.na(press_pos),1,0),
    press_neg_count = if_else(!is.na(press_neg),1,0),
    
    press_greater_03_count = if_else(!is.na(press_greater_03),1,0),
    press_betw_02_03_count = if_else(!is.na(press_betw_02_03),1,0),
    press_betw_01_02_count = if_else(!is.na(press_betw_01_02),1,0),
    press_betw_00_01_count = if_else(!is.na(press_betw_00_01),1,0),
    press_less_00_count    = if_else(!is.na(press_less_00),1,0)
    
  ) %>% 
  # remove helper columns
  dplyr::select(-c(close_1200)) %>% 
  dplyr::select(c(LOTNO,index_max_temp,index_1200F,everything()))

# summarise times at pressures
press_summ <- press_calcs %>% 
  group_by(LOTNO) %>% 
  dplyr::summarise(

    # sum times at negative and positive pressure
    time_at_pos = sum(press_pos_count, na.rm=TRUE),
    time_at_neg = sum(press_neg_count, na.rm=TRUE),
    
    # get means and sd of pressures
    mean_press  = mean(press_1200F, na.rm=TRUE),
    sd_press    = sd(press_1200F,   na.rm=TRUE),

    # sum times at pressure ranges
    time_greater_03 = sum(press_greater_03_count, na.rm=TRUE),
    time_betw_02_03 = sum(press_betw_02_03_count, na.rm=TRUE),
    time_betw_01_02 = sum(press_betw_01_02_count, na.rm=TRUE),
    time_betw_00_01 = sum(press_betw_00_01_count, na.rm=TRUE),
    time_less_00    = sum(press_less_00_count,    na.rm=TRUE)
  )

# join summarised features to original pressure data for easy plotting and verification
press_joined <- left_join(press_calcs, press_summ) %>% 
  mutate(
    pct_time_at_pos     = time_at_pos/index_1200F,
    pct_time_at_neg     = time_at_neg/index_1200F,
    pct_time_greater_03 = time_greater_03/index_1200F,
    pct_time_betw_02_03 = time_betw_02_03/index_1200F,
    pct_time_betw_01_02 = time_betw_01_02/index_1200F,
    pct_time_betw_00_01 = time_betw_00_01/index_1200F,
    pct_time_less_00    = time_less_00/index_1200F
  ) %>% 
  # remove helper columns
  dplyr::select(-c(press_pos_count,
                   press_neg_count,
                   press_greater_03_count,
                   press_betw_02_03_count,
                   press_betw_01_02_count,
                   press_betw_00_01_count,
                   press_less_00_count,
                   press_1200F
  ))

# attached aucDiff to each df
press_summ   <- left_join(press_summ, allAucValues)
press_joined <- left_join(press_joined, allAucValues)

# add kiln column for quick filtering
press_summ <- press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) 

# add weather data as well..
press_summ <- df_yields %>% 
  group_by(LOTNO) %>% slice(1) %>% ungroup() %>% 
  dplyr::select(LOTNO, precip:temp_avg) %>% 
  right_join(press_summ)

# prepare lot yield metric
# get lot yields by summarising fired and rejects
lot_yields <- 
  df_yields %>% 
    group_by(LOTNO) %>% 
    dplyr::summarise(
      lot_items_fired = sum(TOTAL_ITEM_FIRED),
      lot_items_rejected = sum(TOTAL_ITEM_REJECTED),
      lot_yield = (lot_items_fired - lot_items_rejected) / lot_items_fired
    ) %>% 
    dplyr::select(LOTNO, lot_yield)

# join yields with features, remove 20 missing lot yield values
press_summ <- left_join(press_summ, lot_yields) %>% 
  na.omit()

#
# significance function
cor.mtest <- function(mat, ...) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(mat[, i], mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

Visualize features

Quick visualization of randomly selected lots to verify that metrics seem to be produced correctly.

1

# sample random lots
set.seed(123)
sampleC <- sample(levels(kilns_C$LOTNO),1)
sampleD <- sample(levels(kilns_D$LOTNO),1)
sampleE <- sample(levels(kilns_E$LOTNO),1)
sampleF <- sample(levels(kilns_F$LOTNO),1)
sampleG <- sample(levels(kilns_G$LOTNO),2)
sampleH <- sample(levels(kilns_H$LOTNO),2)
sample <- c(sampleC, sampleD, sampleE, sampleF)

# scale and shift values for second y-axis
sec_y_scale=20000
sec_y_shift=1500

# plot
press_joined %>% 
  dplyr::filter(LOTNO %in% sample) %>% 
  
  # KILN TEMP, SETPOINT, PRESSURE
  ggplot(aes(x=time))+
  geom_line(aes(y=avg_kiln_temp))+
  geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
  geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
  
  # PRESSURE POINTS
  geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
  geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
  geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
  geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
  geom_point(aes(y=press_less_00    * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
  
  # HLINES
  geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
  geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
  geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
  geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
    # mean/sd
  # geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift),            color = 'blue',linetype='dotdash',size=1)+
  # geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
  # geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
  
  # PRESSURE RIBBONS
  # geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
  # geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
  # geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
  # geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
  # zoom on ribbons
  # scale_x_continuous(limits = c(0,18.4*60))+
  # scale_y_continuous(limits = c(1250,2350))
    # setpoint, temp AUC ribbon
  geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
  
  scale_y_continuous(name = "Kiln temp (F)",
                   breaks = sort(c(seq(0,3000,1000),400,600,1200)),
                   sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
                                       name = "Pressure",
                                       breaks = seq(-.1,.1,.01)
                                       )
                   )+
  scale_x_continuous(name = "Hours", 
                     labels = number_format(scale=1/60),
                     breaks = sort(c(seq(0,100*60,12*60)))
                     )+
  
  # labels
  # AUC
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
             aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
                 label = paste0( "AUC: ", round(aucDiff,0) )
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
             ) +
  # MEAN
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
             aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
                 label = paste0( "mean: ", round(mean_press,3) )
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
             ) +
  # SD
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
             aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
                 label = paste0( "sd: ", round(sd_press,3) )
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
             ) +
  # POS PRESS
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
             aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
                 label = paste0( time_at_pos, "m, ",
                                 mypercent(pct_time_at_pos))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
  ) +
  # NEG PRESS
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
             aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
                 label = paste0( time_at_neg, "m, ",
                                 mypercent(pct_time_at_neg))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
             )+
  # LESS 00
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
             aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
                 label = paste0( time_less_00, "m, ",
                                 mypercent(pct_time_less_00))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
             )+
  # BETW 00 01
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
             aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
                 label = paste0( time_betw_00_01, "m, ",
                                 mypercent(pct_time_betw_00_01))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
             ) +
  # BETW 01 02
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
             aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
                 label = paste0( time_betw_01_02, "m, ",
                                 mypercent(pct_time_betw_01_02))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
             ) +
  # BETW 02 03
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
             aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
                 label = paste0( time_betw_02_03, "m, ",
                                 mypercent(pct_time_betw_02_03))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
             ) +
  # GREATER 03
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
             aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
                 label = paste0( time_greater_03, "m, ",
                                 mypercent(pct_time_greater_03))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
             ) +
  facet_wrap(~LOTNO)

2

# sample random lots
set.seed(8634)
sample <- sample(levels(kilns_All$LOTNO),4)

# plot
press_joined %>% 
  dplyr::filter(LOTNO %in% sample) %>% 
  
  # KILN TEMP, SETPOINT, PRESSURE
  ggplot(aes(x=time))+
  geom_line(aes(y=avg_kiln_temp))+
  geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
  geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
  
  # PRESSURE POINTS
  geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
  geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
  geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
  geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
  geom_point(aes(y=press_less_00    * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
  
  # HLINES
  geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
  geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
  geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
  geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
    # mean/sd
  # geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift),            color = 'blue',linetype='dotdash',size=1)+
  # geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
  # geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
  
  # PRESSURE RIBBONS
  # geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
  # geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
  # geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
  # geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
  # zoom on ribbons
  # scale_x_continuous(limits = c(0,18.4*60))+
  # scale_y_continuous(limits = c(1250,2350))
    # setpoint, temp AUC ribbon
  geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
  
  scale_y_continuous(name = "Kiln temp (F)",
                   breaks = sort(c(seq(0,3000,1000),400,600,1200)),
                   sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
                                       name = "Pressure",
                                       breaks = seq(-.1,.1,.01)
                                       )
                   )+
  scale_x_continuous(name = "Hours", 
                     labels = number_format(scale=1/60),
                     breaks = sort(c(seq(0,100*60,12*60)))
                     )+
  
  # labels
  # AUC
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
             aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
                 label = paste0( "AUC: ", round(aucDiff,0) )
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
             ) +
  # MEAN
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
             aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
                 label = paste0( "mean: ", round(mean_press,3) )
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
             ) +
  # SD
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
             aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
                 label = paste0( "sd: ", round(sd_press,3) )
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
             ) +
  # POS PRESS
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
             aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
                 label = paste0( time_at_pos, "m, ",
                                 mypercent(pct_time_at_pos))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
  ) +
  # NEG PRESS
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
             aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
                 label = paste0( time_at_neg, "m, ",
                                 mypercent(pct_time_at_neg))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
             )+
  # LESS 00
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
             aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
                 label = paste0( time_less_00, "m, ",
                                 mypercent(pct_time_less_00))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
             )+
  # BETW 00 01
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
             aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
                 label = paste0( time_betw_00_01, "m, ",
                                 mypercent(pct_time_betw_00_01))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
             ) +
  # BETW 01 02
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
             aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
                 label = paste0( time_betw_01_02, "m, ",
                                 mypercent(pct_time_betw_01_02))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
             ) +
  # BETW 02 03
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
             aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
                 label = paste0( time_betw_02_03, "m, ",
                                 mypercent(pct_time_betw_02_03))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
             ) +
  # GREATER 03
  geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
             aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
                 label = paste0( time_greater_03, "m, ",
                                 mypercent(pct_time_greater_03))
             ),
             label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
             ) +
  facet_wrap(~LOTNO)

Distribution of features

Distribution check of new features shows generally skewed and heavy-tailed data.

All features/kilns

# pivot data and plot histograms
press_summ %>%
  dplyr::select(-LOTNO) %>% 
  dplyr::select(time_at_pos:lot_yield) %>% 
  pivot_longer(cols = is.numeric) %>% 
  mutate_if(is.character, factor) %>% 
  ggplot(aes(x=value))+
  geom_freqpoly(bins=15)+
  facet_wrap(~name, scales='free')

mean_press

Mean pressure while temperature is less than 1200F is much higher for kilns G and H.

press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>% 
  dplyr::select(-LOTNO) %>%
  dplyr::select(mean_press,kiln) %>% 
  ggplot(aes(x=mean_press))+
  geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
  geom_freqpoly(bins=15)+
  facet_wrap(~kiln)

time_less_00

press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>% 
  dplyr::select(-LOTNO) %>%
  dplyr::select(time_less_00,kiln) %>% 
  ggplot(aes(x=time_less_00))+
  geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
  geom_freqpoly(bins=15)+
  facet_wrap(~kiln)

time_betw_00_01

press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>% 
  dplyr::select(-LOTNO) %>%
  dplyr::select(time_betw_00_01,kiln) %>% 
  ggplot(aes(x=time_betw_00_01))+
  geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
  geom_freqpoly(bins=15)+
  facet_wrap(~kiln)

time_betw_01_02

Kilns G (especially) and H have far more time between .01 and .02 pressures.

press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>% 
  dplyr::select(-LOTNO) %>%
  dplyr::select(time_betw_01_02,kiln) %>% 
  ggplot(aes(x=time_betw_01_02))+
  geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
  geom_freqpoly(bins=15)+
  facet_wrap(~kiln)

time_betw_02_03

Kilns G (especially) and H have far more time between .02 and .03 pressures.

press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>% 
  dplyr::select(-LOTNO) %>%
  dplyr::select(time_betw_02_03,kiln) %>% 
  ggplot(aes(x=time_betw_02_03))+
  geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
  geom_freqpoly(bins=15)+
  facet_wrap(~kiln)

time_greater_03

press_summ %>%
  mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>% 
  dplyr::select(-LOTNO) %>%
  dplyr::select(time_greater_03,kiln) %>% 
  ggplot(aes(x=time_greater_03))+
  geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
  geom_freqpoly(bins=15)+
  facet_wrap(~kiln)

Lot yields analysis

Corrplots

Quick check of linear correlation of lot yields versus all features (including temps).
Only kiln E exhibits significant linear correlations (p > 0.05) with any pressure features.

C

No significant correlations.

M <- press_summ %>%
  dplyr::filter(kiln == "C") %>% 
  dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
  cor()
M <- M[-9,-9]
p.mat <- cor.mtest(M)
# corrplot(as.matrix(M[8,]), method="number",
#          p.mat = as.matrix(p.mat[8,])
#          , sig.level = 0.05)
corrplot(as.matrix(M[14,]), method="number",
         p.mat = as.matrix(p.mat[14,]), sig.level = 0.05)

D

Corr

Many Weak negative correlations, especially with time_betw_01_02, time_betw_02_03, and aucDiff.

M <- press_summ %>%
  dplyr::filter(kiln == "D") %>% 
  dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
  cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method="number",
         p.mat = as.matrix(p.mat[15])
         , sig.level = 0.05)

Explore

g1 <- press_summ %>%
  dplyr::filter(kiln == "D") %>% 
  ggplot(aes(x=time_betw_01_02, y=lot_yield))+
  geom_point()+
  geom_smooth(method="lm")
g2 <- press_summ %>%
  dplyr::filter(kiln == "D") %>% 
  ggplot(aes(x=time_betw_02_03, y=lot_yield))+
  geom_point()+
  geom_smooth(method="lm")
g3 <- press_summ %>%
  dplyr::filter(kiln == "D") %>% 
  ggplot(aes(x=aucDiff, y=lot_yield))+
  geom_point()+
  geom_smooth(method="lm")
grid.arrange(g1,g2,g3,nrow=1)

E

Corr

Strong negative correlations with sd_press, time_greater_03, and time_betw_02_03. As these values increase, lot yields decrease.

M <- press_summ %>%
  dplyr::filter(kiln == "E") %>% 
  dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
  cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
         p.mat = as.matrix(p.mat[15,])
         , sig.level = 0.05)

Explore

g1 <- press_summ %>%
  dplyr::filter(kiln == "E") %>% 
  ggplot(aes(x=sd_press, y=lot_yield))+
  geom_point()+
  geom_smooth(method='lm')
g2 <- press_summ %>%
  dplyr::filter(kiln == "E") %>% 
  ggplot(aes(x=time_greater_03, y=lot_yield,group=time_greater_03))+
  geom_point()+
  geom_smooth(method='lm')
grid.arrange(g1,g2,nrow=1)

F

No significant correlations.

M <- press_summ %>%
  dplyr::filter(kiln == "F") %>% 
  dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
  cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
         p.mat = as.matrix(p.mat[15,])
         , sig.level = 0.05)

G

No significant correlations.

M <- press_summ %>%
  dplyr::filter(kiln == "G") %>% 
  dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
  cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
         p.mat = as.matrix(p.mat[15,])
         , sig.level = 0.05)

H

No significant correlations.

M <- press_summ %>%
  dplyr::filter(kiln == "H") %>% 
  dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
  cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
         p.mat = as.matrix(p.mat[15,])
         , sig.level = 0.05)

Linear models

Glanced

\(R^2\) values in general are very poor with the exception of kiln E. Though most of these values are inflated due to training on the entire dataset rather than splitting into test and train sets.

# linear model on each kiln
df <- press_summ %>% 
  dplyr::select(-LOTNO,-time_at_pos,-time_at_neg, -temp_avg)

set.seed(234)
df_split <- initial_split(df)
df_train <- training(df_split)
df_test  <- testing(df_split)

lms <- df  %>% 
  nest(data = c(-kiln)) %>% 
  mutate(fit =       map(data, ~lm(lot_yield ~ ., data = .x)),
         tidied =    map(fit, tidy),
         glanced =   map(fit, glance),
         augmented = map(fit, augment)
         ) 

# glanced
lm_glance <- lms %>% 
  unnest(glanced) %>% 
  select(-data,-fit,-tidied,-augmented) %>% 
  select(kiln, r.squared,AIC,BIC) %>% 
  arrange(-r.squared) %>% 
  mutate_if(is.numeric, round, 5)

kable(lm_glance %>% 
        left_join(count(press_summ$kiln) %>% 
                    mutate(kiln = factor(x)) %>% 
                    select(kiln, freq)) %>% 
        select(kiln, freq, everything()),
      format="html") %>% 
  kable_styling(full_width=T)
kiln freq r.squared AIC BIC
E 15 0.99846 -89.84502 -79.22427
D 87 0.23664 -217.41242 -180.42380
C 53 0.21330 -128.16283 -100.57875
G 232 0.09174 -490.19878 -438.49772
H 182 0.08479 -303.27661 -255.21651
F 117 0.03784 -177.39467 -135.96207

Tidied

# tidied
lm_tidy <- lms %>% 
  unnest(tidied) %>% 
  select(-data,-fit,-glanced,-augmented) %>%
  arrange(kiln, p.value) %>% 
  mutate_if(is.numeric, round, 5) %>% 
  mutate(
    p.value = ifelse(p.value < .05, 
                     cell_spec(p.value,"html",color="red"),
                     cell_spec(p.value,"html",color="black"))
  )

C

kable(lm_tidy %>% dplyr::filter(kiln == "C"), format="html",escape = F) %>% 
  pack_rows( index=c("C" = 14)) %>% 
  kable_styling("striped", full_width=T)
kiln term estimate std.error statistic p.value
C
C (Intercept) 0.85120 0.09854 8.63805 0
C sd_press 11.56149 11.55899 1.00022 0.32322
C temp_max 0.00106 0.00111 0.95615 0.34474
C aucDiff 0.00000 0.00000 -0.83611 0.40806
C mean_press -8.00199 9.73070 -0.82235 0.41576
C precip -0.03041 0.04382 -0.69409 0.49164
C time_less_00 -0.00003 0.00006 -0.59942 0.55227
C time_betw_00_01 0.00003 0.00006 0.51597 0.60871
C temp_min 0.00060 0.00128 0.46870 0.64183
C snow_fall -0.00245 0.01420 -0.17259 0.86385
C snow_depth -0.00078 0.00453 -0.17238 0.86401
C time_betw_01_02 0.00026 0.00244 0.10564 0.9164
C time_betw_02_03 -0.00005 0.00717 -0.00764 0.99394
C time_greater_03 NA NA NA NA

D

kable(lm_tidy %>% dplyr::filter(kiln == "D"), format="html",escape = F) %>% 
  pack_rows( index=c("D" = 14)) %>% 
  kable_styling("striped", full_width=T)
kiln term estimate std.error statistic p.value
D
D (Intercept) 0.91481 0.05624 16.26736 0
D time_betw_02_03 -0.05878 0.02283 -2.57484 0.01205
D aucDiff 0.00000 0.00000 -2.18340 0.03222
D snow_fall 0.02275 0.01551 1.46668 0.14676
D time_less_00 0.00007 0.00006 1.35300 0.18023
D time_betw_01_02 -0.00128 0.00117 -1.08975 0.27941
D snow_depth -0.00339 0.00344 -0.98540 0.32768
D time_greater_03 0.01171 0.01829 0.64028 0.524
D sd_press -3.84453 7.07149 -0.54367 0.58833
D time_betw_00_01 0.00004 0.00009 0.48635 0.62817
D mean_press 4.34886 14.00702 0.31048 0.75708
D temp_min -0.00027 0.00105 -0.25494 0.79948
D precip 0.00619 0.02769 0.22368 0.82363
D temp_max 0.00008 0.00095 0.07914 0.93714

E

kable(lm_tidy %>% dplyr::filter(kiln == "E"), format="html",escape = F) %>% 
  pack_rows( index=c("E" = 14)) %>% 
  kable_styling("striped", full_width=T)
kiln term estimate std.error statistic p.value
E
E time_betw_01_02 0.00182 0.00018 10.32889 0.06144
E mean_press -108.85279 14.60849 -7.45134 0.08493
E time_betw_00_01 0.00078 0.00014 5.41257 0.11631
E temp_min 0.00823 0.00185 4.45509 0.14057
E temp_max -0.00621 0.00145 -4.27460 0.1463
E (Intercept) 0.58429 0.20431 2.85989 0.21414
E snow_depth 0.08660 0.03851 2.24871 0.26639
E snow_fall -0.18387 0.09017 -2.03903 0.29027
E time_less_00 0.00026 0.00013 2.00809 0.29414
E time_greater_03 -0.06790 0.03802 -1.78606 0.32493
E aucDiff 0.00000 0.00000 -0.93358 0.52186
E precip -0.17567 0.21740 -0.80802 0.56735
E sd_press -18.66761 32.50378 -0.57432 0.66811
E time_betw_02_03 0.00242 0.02054 0.11797 0.92525

F

kable(lm_tidy %>% dplyr::filter(kiln == "F"), format="html",escape = F) %>% 
  pack_rows( index=c("G" = 14)) %>% 
  kable_styling("striped", full_width=T)
kiln term estimate std.error statistic p.value
G
F (Intercept) 0.88886 0.09748 9.11789 0
F temp_max 0.00170 0.00129 1.32252 0.18892
F temp_min -0.00172 0.00140 -1.22340 0.22397
F aucDiff 0.00000 0.00000 0.75490 0.45203
F time_betw_01_02 0.00028 0.00046 0.61192 0.54194
F sd_press 2.99793 7.30504 0.41039 0.68237
F time_greater_03 0.00088 0.00237 0.37248 0.7103
F time_less_00 -0.00003 0.00008 -0.37204 0.71063
F time_betw_00_01 -0.00003 0.00008 -0.37046 0.7118
F snow_depth -0.00137 0.00484 -0.28324 0.77756
F snow_fall -0.00484 0.01971 -0.24561 0.80647
F precip -0.00835 0.04027 -0.20746 0.83606
F time_betw_02_03 -0.00017 0.00186 -0.09120 0.92751
F mean_press 0.26109 11.37128 0.02296 0.98173

G

kable(lm_tidy %>% dplyr::filter(kiln == "G"), format="html",escape = F) %>% 
  pack_rows( index=c("G" = 14)) %>% 
  kable_styling("striped", full_width=T)
kiln term estimate std.error statistic p.value
G
G (Intercept) 0.69377 0.07871 8.81458 0
G time_greater_03 -0.00060 0.00019 -3.08980 0.00226
G time_betw_00_01 0.00013 0.00005 2.65998 0.0084
G mean_press 14.07606 5.94712 2.36687 0.01881
G time_less_00 0.00016 0.00009 1.90628 0.05793
G sd_press 6.07222 3.25883 1.86331 0.06376
G aucDiff 0.00001 0.00000 1.24822 0.21329
G time_betw_01_02 -0.00006 0.00006 -0.95087 0.34273
G snow_fall -0.00502 0.00590 -0.85106 0.39567
G temp_max -0.00051 0.00066 -0.77430 0.43959
G time_betw_02_03 -0.00003 0.00010 -0.31542 0.75275
G snow_depth 0.00081 0.00303 0.26627 0.79028
G temp_min -0.00003 0.00075 -0.04422 0.96477
G precip -0.00067 0.02223 -0.03026 0.97589

H

kable(lm_tidy %>% dplyr::filter(kiln == "H"), format="html",escape = F) %>% 
  pack_rows( index=c("H" = 14)) %>% 
  kable_styling("striped", full_width=T)
kiln term estimate std.error statistic p.value
H
H (Intercept) 0.88829 0.14374 6.17995 0
H aucDiff 0.00000 0.00000 -2.91740 0.00401
H time_betw_01_02 0.00022 0.00014 1.63055 0.10486
H temp_max -0.00111 0.00102 -1.09009 0.27723
H time_betw_00_01 0.00011 0.00012 0.93691 0.35015
H mean_press -7.49653 8.88233 -0.84398 0.39988
H time_betw_02_03 0.00026 0.00031 0.83270 0.4062
H precip 0.01484 0.02994 0.49574 0.62073
H time_less_00 0.00006 0.00016 0.36109 0.71848
H snow_fall 0.00339 0.00948 0.35727 0.72134
H sd_press -2.37194 7.45432 -0.31820 0.75073
H snow_depth -0.00109 0.00398 -0.27476 0.78384
H time_greater_03 0.00011 0.00065 0.16109 0.87222
H temp_min 0.00015 0.00114 0.12964 0.89701

Augmented

Predicted values versus actual values.

# augmented
lms %>% unnest(augmented) %>% 
  select(-data,-fit,-tidied,-glanced) %>% 
  ggplot(aes(x=lot_yield, y=.fitted))+
  geom_pointdensity(show.legend = FALSE)+
  # geom_point()+
  geom_abline(lty=2,color='red')+
  facet_wrap(~kiln)+
  scale_y_continuous(limits=c(0.6,1))+
  scale_x_continuous(limits=c(0.6,1))+
  scale_color_viridis_c()

Variable importance

To get an idea of which variables are most important we can use LASSO regression to remove the least important variables. We can also test with a random forest variable importance implementation for comparison sake.

G

Linear

Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.

kable(lm_tidy %>% dplyr::filter(kiln == "G"), 
      format="html",
      caption = paste0("Significant variables by p-value, Model R^2: ",
                       round(lm_glance$r.squared[[4]], 3)),
      escape = F) %>%
  pack_rows( index=c("Kiln G" = 14)) %>%
  kable_styling("striped", full_width=T)
Significant variables by p-value, Model R^2: 0.092
kiln term estimate std.error statistic p.value
Kiln G
G (Intercept) 0.69377 0.07871 8.81458 0
G time_greater_03 -0.00060 0.00019 -3.08980 0.00226
G time_betw_00_01 0.00013 0.00005 2.65998 0.0084
G mean_press 14.07606 5.94712 2.36687 0.01881
G time_less_00 0.00016 0.00009 1.90628 0.05793
G sd_press 6.07222 3.25883 1.86331 0.06376
G aucDiff 0.00001 0.00000 1.24822 0.21329
G time_betw_01_02 -0.00006 0.00006 -0.95087 0.34273
G snow_fall -0.00502 0.00590 -0.85106 0.39567
G temp_max -0.00051 0.00066 -0.77430 0.43959
G time_betw_02_03 -0.00003 0.00010 -0.31542 0.75275
G snow_depth 0.00081 0.00303 0.26627 0.79028
G temp_min -0.00003 0.00075 -0.04422 0.96477
G precip -0.00067 0.02223 -0.03026 0.97589

Lasso

LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.

# original data
df <- press_summ %>% 
  dplyr::filter(kiln == "G") %>% 
  # dplyr::select(mean_press:lot_yield, -kiln)
  dplyr::select(precip:lot_yield, -kiln)

# splits
set.seed(323)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test  <- testing(pressure_split)

# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>% 
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  prep()

# lasso workflow
wf <- workflow() %>% 
  add_recipe(pressure_rec)

# lasso model specification
lasso_spec <- linear_reg(penalty = tune(), 
                         mixture = 1) %>% 
  set_engine("glmnet")

# generate folds for tuning
set.seed(1434)
lasso_folds <- vfold_cv(pressure_train)

# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)

# tune model
set.seed(211)
lasso_grid <- tune_grid(
  wf %>% add_model(lasso_spec),
  resamples = lasso_folds,
  grid = lambda_grid
)

# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
#   ggplot(aes(penalty, mean, color=.metric))+
#   geom_errorbar(aes(ymin = mean - std_err,
#                     ymax = mean + std_err), alpha = .4)+
#   geom_point(size=2)+
#   geom_line(size=1.5)+
#   scale_x_log10()+
#   facet_wrap(~.metric,nrow=2,scales="free")

# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")

# finalize model wf
final_lasso_wf <- finalize_workflow(
  wf %>% add_model(lasso_spec),
  best_rsq
  # best_rmse
)

# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)

# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>% 
  fit(df) %>%
  # fit(pressure_train) %>% 
  pull_workflow_fit() %>% 
  vi(lambda = best_rsq$penalty) %>% 
  mutate(Importance = abs(Importance),
         Variable = fct_reorder(Variable, Importance)) %>% 
  ggplot(aes(x=Importance,y=Variable,fill=Sign))+
  geom_col()+
  labs(y=NULL)+
  scale_x_continuous(expand = c(0,0))+
  ggtitle(paste0("Variable importance using LASSO"))+
  labs(subtitle = paste0("Model R^2: ", round(r2,4)))

gg_lasso_importance

Random Forest

# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test  <- testing(pressure_split)

# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>% 
  prep()
  
# rf model specification
rf_spec <- rand_forest(mode="regression", 
                       mtry  = tune(), 
                       trees = 1000, 
                       min_n = tune()) %>% 
  set_engine("randomForest")

# workflow
wf <- workflow() %>% 
  add_recipe(pressure_rec)

# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)

# tune on non-specific grid
tune_res <- tune_grid(
  wf %>% add_model(rf_spec),
  resamples = rf_folds,
  grid = 20
  )

# visualize first tuning results
# tune_res %>% collect_metrics() %>%
#   filter(.metric == "rsq") %>%
#   select(mean, min_n, mtry) %>%
#   pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
#   ggplot(aes(value,mean,color=parameter))+
#   geom_point()+
#   geom_line()+
#   facet_wrap(~parameter, nrow=2, scales="free")

# generate more specific grid
rf_grid <- grid_regular(
  mtry(range = c(8,15)),
  min_n(range = c(15,35)),
  levels = 6
  )

# tune on more specific grid
regular_res <- tune_grid(
  wf %>% add_model(rf_spec),
  resamples = rf_folds,
  grid = rf_grid
  )

# visualize second tuning results
# regular_res %>% collect_metrics() %>%
#   filter(.metric == "rsq") %>%
#   mutate(min_n = as.factor(min_n)) %>%
#   ggplot(aes(mtry, mean, color = min_n))+
#   geom_point()+
#   geom_line(size=1.2)

# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")

# finalize model
final_rf <- finalize_model(
  rf_spec,
  best_rsq
)

# variable importance on training
# final_rf %>% 
#   set_engine("randomForest") %>% 
#   fit(lot_yield ~ ., juice(pressure_rec)) %>% 
#   vip(geom = "point")

# final random forest workflow
final_rf_wf <- workflow() %>% 
  add_recipe(pressure_rec) %>% 
  add_model(final_rf)

# fit model on test set
final_rf_res <- final_rf_wf %>% 
  last_fit(pressure_split)

# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)

# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>% 
#   ggplot(aes(.pred, lot_yield))+
#   geom_point()+geom_abline(lty=2,color='red')

# final model
# final_rf_model <- fit(final_rf_wf, df)

# variable importance on all data
gg_rf_importance <- final_rf %>% 
  set_engine("randomForest") %>% 
  fit(lot_yield ~ ., df) %>% 
  vip(geom = "point")+
  ggtitle(paste0("Variable importance using RandomForest"))+
  labs(subtitle = paste0("Model R^2: ", round(r2,4)))

gg_rf_importance

H

Linear

Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.

kable(lm_tidy %>% dplyr::filter(kiln == "H"), 
      format="html",
      caption = paste0("Significant variables by p-value, Model R^2: ",
                       round(lm_glance$r.squared[[5]], 3)),
      escape = F) %>%
  pack_rows( index=c("Kiln H" = 14)) %>%
  kable_styling("striped", full_width=T)
Significant variables by p-value, Model R^2: 0.085
kiln term estimate std.error statistic p.value
Kiln H
H (Intercept) 0.88829 0.14374 6.17995 0
H aucDiff 0.00000 0.00000 -2.91740 0.00401
H time_betw_01_02 0.00022 0.00014 1.63055 0.10486
H temp_max -0.00111 0.00102 -1.09009 0.27723
H time_betw_00_01 0.00011 0.00012 0.93691 0.35015
H mean_press -7.49653 8.88233 -0.84398 0.39988
H time_betw_02_03 0.00026 0.00031 0.83270 0.4062
H precip 0.01484 0.02994 0.49574 0.62073
H time_less_00 0.00006 0.00016 0.36109 0.71848
H snow_fall 0.00339 0.00948 0.35727 0.72134
H sd_press -2.37194 7.45432 -0.31820 0.75073
H snow_depth -0.00109 0.00398 -0.27476 0.78384
H time_greater_03 0.00011 0.00065 0.16109 0.87222
H temp_min 0.00015 0.00114 0.12964 0.89701

Lasso

LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.

# original data
df <- press_summ %>% 
  dplyr::filter(kiln == "H") %>% 
  # dplyr::select(mean_press:lot_yield, -kiln)
  dplyr::select(precip:lot_yield, -kiln)

# splits
set.seed(32)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test  <- testing(pressure_split)

# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>% 
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  prep()

# lasso workflow
wf <- workflow() %>% 
  add_recipe(pressure_rec)

# lasso model specification
lasso_spec <- linear_reg(penalty = tune(), 
                         mixture = 1) %>% 
  set_engine("glmnet")

# generate folds for tuning
set.seed(144)
lasso_folds <- vfold_cv(pressure_train)

# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)

# tune model
set.seed(211)
lasso_grid <- tune_grid(
  wf %>% add_model(lasso_spec),
  resamples = lasso_folds,
  grid = lambda_grid
)

# visualize tuning results
# lasso_grid %>% collect_metrics() %>% 
#   ggplot(aes(penalty, mean, color=.metric))+
#   geom_errorbar(aes(ymin = mean - std_err,
#                     ymax = mean + std_err), alpha = .4)+
#   geom_point(size=2)+
#   geom_line(size=1.5)+
#   scale_x_log10()+
#   facet_wrap(~.metric,nrow=2,scales="free")

# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")

# finalize model wf
final_lasso_wf <- finalize_workflow(
  wf %>% add_model(lasso_spec),
  best_rsq
  # best_rmse
)

# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)

# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>% 
  fit(df) %>%
  # fit(pressure_train) %>% 
  pull_workflow_fit() %>% 
  vi(lambda = best_rsq$penalty) %>% 
  mutate(Importance = abs(Importance),
         Variable = fct_reorder(Variable, Importance)) %>% 
  ggplot(aes(x=Importance,y=Variable,fill=Sign))+
  geom_col()+
  labs(y=NULL)+
  scale_x_continuous(expand = c(0,0))+
  ggtitle(paste0("Variable importance using LASSO"))+
  labs(subtitle = paste0("Model R^2: ", round(r2,4)))

gg_lasso_importance

Random Forest

# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test  <- testing(pressure_split)

# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>% 
  prep()
  
# rf model specification
rf_spec <- rand_forest(mode="regression", 
                       mtry  = tune(), 
                       trees = 1000, 
                       min_n = tune()) %>% 
  set_engine("randomForest")

# workflow
wf <- workflow() %>% 
  add_recipe(pressure_rec)

# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)

# tune on non-specific grid
tune_res <- tune_grid(
  wf %>% add_model(rf_spec),
  resamples = rf_folds,
  grid = 20
  )

# visualize first tuning results
# tune_res %>% collect_metrics() %>%
#   filter(.metric == "rsq") %>%
#   select(mean, min_n, mtry) %>%
#   pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
#   ggplot(aes(value,mean,color=parameter))+
#   geom_point()+
#   geom_line()+
#   facet_wrap(~parameter, nrow=2, scales="free")

# generate more specific grid
rf_grid <- grid_regular(
  mtry(range = c(1,5)),
  min_n(range = c(10,30)),
  levels = 7
  )

# tune on more specific grid
regular_res <- tune_grid(
  wf %>% add_model(rf_spec),
  resamples = rf_folds,
  grid = rf_grid
  )

# visualize second tuning results
# regular_res %>% collect_metrics() %>% 
#   filter(.metric == "rsq") %>% 
#   mutate(min_n = as.factor(min_n)) %>% 
#   ggplot(aes(mtry, mean, color = min_n))+
#   geom_point()+
#   geom_line(size=1.2)

# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")

# finalize model
final_rf <- finalize_model(
  rf_spec,
  best_rsq
)

# variable importance on training
# final_rf %>% 
#   set_engine("randomForest") %>% 
#   fit(lot_yield ~ ., juice(pressure_rec)) %>% 
#   vip(geom = "point")

# final random forest workflow
final_rf_wf <- workflow() %>% 
  add_recipe(pressure_rec) %>% 
  add_model(final_rf)

# fit model on test set
final_rf_res <- final_rf_wf %>% 
  last_fit(pressure_split)

# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)

# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>% 
#   ggplot(aes(.pred, lot_yield))+
#   geom_point()+geom_abline(lty=2,color='red')

# final model
# final_rf_model <- fit(final_rf_wf, df)

# variable importance on all data
gg_rf_importance <- final_rf %>% 
  set_engine("randomForest") %>% 
  fit(lot_yield ~ ., df) %>% 
  vip(geom = "point")+
  ggtitle(paste0("Variable importance using RandomForest"))+
  labs(subtitle = paste0("Model R^2: ", round(r2,4)))

gg_rf_importance

D

Linear

Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.

kable(lm_tidy %>% dplyr::filter(kiln == "D"), 
      format="html",
      caption = paste0("Significant variables by p-value, Model R^2: ",
                       round(lm_glance$r.squared[[2]], 3)),
      escape = F) %>%
  pack_rows( index=c("Kiln D" = 14)) %>%
  kable_styling("striped", full_width=T)
Significant variables by p-value, Model R^2: 0.237
kiln term estimate std.error statistic p.value
Kiln D
D (Intercept) 0.91481 0.05624 16.26736 0
D time_betw_02_03 -0.05878 0.02283 -2.57484 0.01205
D aucDiff 0.00000 0.00000 -2.18340 0.03222
D snow_fall 0.02275 0.01551 1.46668 0.14676
D time_less_00 0.00007 0.00006 1.35300 0.18023
D time_betw_01_02 -0.00128 0.00117 -1.08975 0.27941
D snow_depth -0.00339 0.00344 -0.98540 0.32768
D time_greater_03 0.01171 0.01829 0.64028 0.524
D sd_press -3.84453 7.07149 -0.54367 0.58833
D time_betw_00_01 0.00004 0.00009 0.48635 0.62817
D mean_press 4.34886 14.00702 0.31048 0.75708
D temp_min -0.00027 0.00105 -0.25494 0.79948
D precip 0.00619 0.02769 0.22368 0.82363
D temp_max 0.00008 0.00095 0.07914 0.93714

Lasso

LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.

# original data
df <- press_summ %>% 
  dplyr::filter(kiln == "D") %>% 
  # dplyr::select(mean_press:lot_yield, -kiln)
  dplyr::select(precip:lot_yield, -kiln)

# splits
set.seed(32)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test  <- testing(pressure_split)

# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>% 
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  prep()

# lasso workflow
wf <- workflow() %>% 
  add_recipe(pressure_rec)

# lasso model specification
lasso_spec <- linear_reg(penalty = tune(), 
                         mixture = 1) %>% 
  set_engine("glmnet")

# generate folds for tuning
set.seed(144)
lasso_folds <- vfold_cv(pressure_train)

# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)

# tune model
set.seed(211)
lasso_grid <- tune_grid(
  wf %>% add_model(lasso_spec),
  resamples = lasso_folds,
  grid = lambda_grid
)

# visualize tuning results
# lasso_grid %>% collect_metrics() %>% 
#   ggplot(aes(penalty, mean, color=.metric))+
#   geom_errorbar(aes(ymin = mean - std_err,
#                     ymax = mean + std_err), alpha = .4)+
#   geom_point(size=2)+
#   geom_line(size=1.5)+
#   scale_x_log10()+
#   facet_wrap(~.metric,nrow=2,scales="free")

# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")

# finalize model wf
final_lasso_wf <- finalize_workflow(
  wf %>% add_model(lasso_spec),
  best_rsq
  # best_rmse
)

# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)

# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>% 
  fit(df) %>%
  # fit(pressure_train) %>% 
  pull_workflow_fit() %>% 
  vi(lambda = best_rsq$penalty) %>% 
  mutate(Importance = abs(Importance),
         Variable = fct_reorder(Variable, Importance)) %>% 
  ggplot(aes(x=Importance,y=Variable,fill=Sign))+
  geom_col()+
  labs(y=NULL)+
  scale_x_continuous(expand = c(0,0))+
  ggtitle(paste0("Variable importance using LASSO"))+
  labs(subtitle = paste0("Model R^2: ", round(r2,4)))

gg_lasso_importance

Random Forest

# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test  <- testing(pressure_split)

# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>% 
  prep()
  
# rf model specification
rf_spec <- rand_forest(mode="regression", 
                       mtry  = tune(), 
                       trees = 1000, 
                       min_n = tune()) %>% 
  set_engine("randomForest")

# workflow
wf <- workflow() %>% 
  add_recipe(pressure_rec)

# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)

# tune on non-specific grid
tune_res <- tune_grid(
  wf %>% add_model(rf_spec),
  resamples = rf_folds,
  grid = 20
  )

# visualize first tuning results
# tune_res %>% collect_metrics() %>%
#   filter(.metric == "rsq") %>%
#   select(mean, min_n, mtry) %>%
#   pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
#   ggplot(aes(value,mean,color=parameter))+
#   geom_point()+
#   geom_line()+
#   facet_wrap(~parameter, nrow=2, scales="free")

# generate more specific grid
rf_grid <- grid_regular(
  min_n(range = c(1,20)),
  mtry(range = c(10,16)),
  levels = 6
  )

# tune on more specific grid
regular_res <- tune_grid(
  wf %>% add_model(rf_spec),
  resamples = rf_folds,
  grid = rf_grid
  )

# visualize second tuning results
# regular_res %>% collect_metrics() %>% 
#   filter(.metric == "rsq") %>% 
#   mutate(min_n = as.factor(min_n)) %>% 
#   ggplot(aes(mtry, mean, color = min_n))+
#   geom_point()+
#   geom_line(size=1.2)

# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")

# finalize model
final_rf <- finalize_model(
  rf_spec,
  best_rsq
)

# variable importance on training
# final_rf %>% 
#   set_engine("randomForest") %>% 
#   fit(lot_yield ~ ., juice(pressure_rec)) %>% 
#   vip(geom = "point")

# final random forest workflow
final_rf_wf <- workflow() %>% 
  add_recipe(pressure_rec) %>% 
  add_model(final_rf)

# fit model on test set
final_rf_res <- final_rf_wf %>% 
  last_fit(pressure_split)

# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)

# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>% 
#   ggplot(aes(.pred, lot_yield))+
#   geom_point()+geom_abline(lty=2,color='red')

# final model
# final_rf_model <- fit(final_rf_wf, df)

# variable importance on all data
gg_rf_importance <- final_rf %>% 
  set_engine("randomForest") %>% 
  fit(lot_yield ~ ., df) %>% 
  vip(geom = "point")+
  ggtitle(paste0("Variable importance using RandomForest"))+
  labs(subtitle = paste0("Model R^2: ", round(r2,4)))

gg_rf_importance

Item yields analysis

Rather than segregate into kilns, we group by item.

press_summ
## # A tibble: 686 x 19
##    LOTNO precip snow_fall snow_depth temp_max temp_min temp_avg time_at_pos
##    <chr>  <dbl>     <dbl>      <dbl>    <dbl>    <dbl>    <dbl>       <dbl>
##  1 0102~   0.02       0            0       37       22     29.5         132
##  2 0103~   0.08       0.2          0       35       23     29          1144
##  3 0103~   0          0            0       42       26     34           602
##  4 0103~   0          0            0       42       26     34          1021
##  5 0104~   0          0            4       27       -4     11.5         545
##  6 0104~   0          0            4       27       -4     11.5         307
##  7 0104~   0          0            4       27       -4     11.5         991
##  8 0104~   0          0            0       30       26     28           154
##  9 0104~   0          0            0       30       26     28           999
## 10 0104~   0.16       0            0       45       34     39.5        1204
## # ... with 676 more rows, and 11 more variables: time_at_neg <dbl>,
## #   mean_press <dbl>, sd_press <dbl>, time_greater_03 <dbl>,
## #   time_betw_02_03 <dbl>, time_betw_01_02 <dbl>, time_betw_00_01 <dbl>,
## #   time_less_00 <dbl>, aucDiff <dbl>, kiln <fct>, lot_yield <dbl>
df_merged
## # A tibble: 55,847 x 34
##    FIRE_DATE  month  year year_month LOTNO KILN    ITEM TYPE  EC    PPI  
##    <date>     <chr> <dbl> <chr>      <chr> <chr>  <dbl> <chr> <chr> <chr>
##  1 2016-01-02 Jan    2016 2016-Jan   0102~ B     854159 HF    None  10   
##  2 2016-01-02 Jan    2016 2016-Jan   0102~ B     852337 HF    None  10   
##  3 2016-01-02 Jan    2016 2016-Jan   0102~ G     853722 HF    FEC   10   
##  4 2016-01-02 Jan    2016 2016-Jan   0102~ G     853753 HF    FEC   10   
##  5 2016-01-02 Jan    2016 2016-Jan   0102~ G     853762 HF    FEC   15   
##  6 2016-01-02 Jan    2016 2016-Jan   0102~ G     853765 HF    None  30   
##  7 2016-01-02 Jan    2016 2016-Jan   0102~ G     853765 HF    None  30   
##  8 2016-01-02 Jan    2016 2016-Jan   0102~ G     853765 HF    None  30   
##  9 2016-01-02 Jan    2016 2016-Jan   0102~ G     853798 HF    SEC   30   
## 10 2016-01-02 Jan    2016 2016-Jan   0102~ G     853798 HF    SEC   30   
## # ... with 55,837 more rows, and 24 more variables: COMPOSITION <chr>,
## #   DESCRIPTION <chr>, CAUSE <chr>, reject_vol_single_row_D <dbl>,
## #   reject_cost_single_row_D <dbl>, TOTAL_ITEM_FIRED_Y <dbl>,
## #   total_item_count_fired_D <dbl>, TOTAL_ITEM_REJECTED_Y <dbl>,
## #   total_item_count_rejected_D <dbl>, total_item_pct_yield_Y <dbl>,
## #   total_item_cost_of_rejects_D <dbl>, total_item_fired_vol_D <dbl>,
## #   total_item_vol_fired_Y <dbl>, total_item_rejected_vol_D <dbl>,
## #   total_item_vol_rejected_Y <dbl>, cost_piece <dbl>, vol_piece <dbl>,
## #   COST_IN3 <dbl>, precip <dbl>, snow_fall <dbl>, snow_depth <dbl>,
## #   temp_max <dbl>, temp_min <dbl>, temp_avg <dbl>
# df yields join with press summ ... do pressures impact yields of items?


# df merged/defect join with press summ ... do pressures impact on occurence of defects?